Affymetrix Processing

Workflow Version: NF_MAAffymetrix_1.0.5

Published

June 26, 2024

1 Validate Parameters

Code
# Ensure requisite package downloads occur in task directory
#   This is necessary since oligo attempts to install an annotations package when loading raw files.
#   This prevents permissions issues for installing and prevents side effects of processing a given dataset.
#     (i.e. changes to more permanent package installations)
.libPaths(c(getwd(), .libPaths()))

library(dplyr) # Ensure infix operator is available, methods should still reference dplyr namespace otherwise
options(dplyr.summarise.inform = FALSE) # Don't print out '`summarise()` has grouped output by 'group'. You can override using the `.groups` argument.'

# Guard clause: the runsheet path is required for every downstream step
if (is.null(params$runsheet)) {
  stop("PARAMETERIZATION ERROR: Must supply runsheet path")
}

runsheet <- params$runsheet # <path/to/runsheet>

message(params) # Log the supplied render parameters

## Set up output structure

# Output Constants
DIR_RAW_DATA <- "00-RawData"
DIR_NORMALIZED_EXPRESSION <- "01-oligo_NormExp"
DIR_DGE <- "02-limma_DGE"

dir.create(DIR_RAW_DATA)
dir.create(DIR_NORMALIZED_EXPRESSION)
dir.create(DIR_DGE)

## Save original par settings
##   Par may be temporarily changed for plotting purposes and reset once the plotting is done
##   no.readonly = TRUE excludes read-only parameters so later par(original_par)
##   calls restore cleanly instead of warning that read-only parameters cannot be set
original_par <- par(no.readonly = TRUE)
options(preferRaster=TRUE) # use Raster when possible to avoid antialiasing artifacts in images

options(timeout=1000) # Allow long-running downloads (raw array files can be large)

2 Load Metadata and Raw Data

Code
print("Loading Runsheet...") # NON_DPPD
[1] "Loading Runsheet..."
Code
# Utility function to improve robustness of function calls
# Used to remedy intermittent internet issues during runtime
retry_with_delay <- function(func, ...) {
  #' Call func(...) up to max_attempts times, sleeping between failed attempts.
  #' The delay starts at initial_delay seconds and grows by delay_increase after
  #' each failure. Returns func's result on first success; stops with the last
  #' error message if every attempt fails.
  max_attempts <- 5
  initial_delay <- 10
  delay_increase <- 30

  # Capture the expression supplied as 'func' once, for log messages
  # (substitute() must be evaluated before 'func' is forced inside the loop)
  func_label <- deparse(substitute(func))

  current_delay <- initial_delay
  for (attempt in seq_len(max_attempts)) {
    result <- tryCatch(
      expr = func(...),
      error = function(e) e
    )

    if (!inherits(result, "error")) {
      return(result)
    }

    if (attempt < max_attempts) {
      message(paste("Retry attempt", attempt, "failed for function with name <", func_label, ">. Retrying in", current_delay, "second(s)..."))
      Sys.sleep(current_delay)
      current_delay <- current_delay + delay_increase
    } else {
      # conditionMessage() is the canonical accessor for a condition's text
      stop(paste("Max retry attempts reached. Last error:", conditionMessage(result)))
    }
  }
}

# Load the runsheet (sample metadata); check.names = FALSE preserves spaced column names like 'Array Data File Path'
df_rs <- read.csv(runsheet, check.names = FALSE) %>% 
          dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character


## Determines the organism specific annotation file to use based on the organism in the runsheet
fetch_organism_specific_annotation_file_path <- function(organism) {
  #' Look up the organism's row in the GeneLab GL-DPPD-7110_annotations.csv
  #' table and return its annotation file url.
  #' Stops with an informative error when the organism has no annotation file yet.

  ANNOTATIONS_TABLE_URL <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv"

  annotations_table <- read.csv(ANNOTATIONS_TABLE_URL)

  matched_links <- annotations_table %>%
    dplyr::filter(species == organism) %>%
    dplyr::pull(genelab_annots_link)

  # Guard clause: an unsupported organism has no matching row, hence no link
  if (length(matched_links) == 0) {
    stop(glue::glue("Organism supplied '{organism}' is not supported. See the following url for supported organisms: https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv.  Supported organisms will correspond to a row based on the 'species' column and include a url in the 'genelab_annots_link' column of that row"))
  }

  matched_links
}
# Resolve the annotation table url for this dataset's organism (retried: remote csv fetch)
annotation_file_path <- retry_with_delay(fetch_organism_specific_annotation_file_path, unique(df_rs$organism))

# NON_DPPD:START
print("Here is the embedded runsheet")
[1] "Here is the embedded runsheet"
Code
DT::datatable(df_rs)
Code
print("Here are the expected comparison groups")
[1] "Here are the expected comparison groups"
Code
# NON_DPPD:END
print("Loading Raw Data...") # NON_DPPD
[1] "Loading Raw Data..."
Code
allTrue <- function(i_vector) {
  #' TRUE only when every element of a non-empty vector is TRUE.
  # Guard: all(logical(0)) would return TRUE, which could silently mask an
  # empty runsheet column upstream, so zero-length input is an error instead.
  if (length(i_vector) == 0) {
    stop("Input vector is length zero") # single-arg paste() removed: it was a no-op
  }
  all(i_vector)
}

# Define paths to raw data files
runsheetPathsAreURIs <- function(df_runsheet) {
  # TRUE when every 'Array Data File Path' entry is a remote https URI
  path_values <- df_runsheet$`Array Data File Path`
  allTrue(startsWith(path_values, "https"))
}


# Download raw data files
downloadFilesFromRunsheet <- function(df_runsheet) {
  #' Download each 'Array Data File Path' url to its 'Array Data File Name'
  #' in the working directory. An already-present file is reused (with a
  #' warning) rather than re-downloaded. Returns the destination file names.
  urls <- df_runsheet$`Array Data File Path`
  destinationFiles <- df_runsheet$`Array Data File Name`

  mapply(function(url, destinationFile) {
    print(paste0("Downloading from '", url, "' TO '", destinationFile, "'"))
    if ( file.exists(destinationFile ) ) {
      warning(paste( "Using Existing File:", destinationFile ))
    } else {
      download.file(url, destinationFile)
    }
  }, urls, destinationFiles)

  destinationFiles # Return these paths
}


# Choose loading strategy: download remote files, or use the runsheet paths directly
if ( runsheetPathsAreURIs(df_rs) ) {
  print("Determined Raw Data Locations are URIS")
  local_paths <- retry_with_delay(downloadFilesFromRunsheet, df_rs)
} else {
  print("Or Determined Raw Data Locations are local paths")
  local_paths <- df_rs$`Array Data File Path`
}
[1] "Determined Raw Data Locations are URIS"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_14R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_14R_(Mouse430_2).CEL.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_16R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_16R_(Mouse430_2).CEL.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_20R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_20R_(Mouse430_2).CEL.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_52R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_52R_(Mouse430_2).CEL.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_54R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_54R_(Mouse430_2).CEL.gz'"
[1] "Downloading from 'https://genelab-data.ndc.nasa.gov/geode-py/ws/studies/OSD-87/download?source=datamanager&file=GLDS-87_microarray_58R_(Mouse430_2).CEL.gz' TO 'GLDS-87_microarray_58R_(Mouse430_2).CEL.gz'"
Code
# uncompress files if needed
if ( allTrue(stringr::str_ends(local_paths, ".gz")) ) {
  print("Determined these files are gzip compressed... uncompressing now")
  # This does the uncompression (the original .gz files are kept on disk)
  lapply(local_paths, R.utils::gunzip, remove = FALSE, overwrite = TRUE)
  # This removes the .gz extension to get the uncompressed filenames
  local_paths <- vapply(local_paths, 
                        stringr::str_replace, # Run this function against each item in 'local_paths'
                        FUN.VALUE = character(1),  # Expect a character vector as a return
                        USE.NAMES = FALSE,  # Don't use the input to assign names for the returned list
                        pattern = ".gz$", # first argument for applied function
                        replacement = ""  # second argument for applied function
                        )
}
[1] "Determined these files are gzip compressed... uncompressing now"
Code
# Map each sample name to the local path of its raw data file
df_local_paths <- data.frame(`Sample Name` = df_rs$`Sample Name`, `Local Paths` = local_paths, check.names = FALSE)
# NON_DPPD:START
print("Raw Data Loaded Successfully")
[1] "Raw Data Loaded Successfully"
Code
DT::datatable(df_local_paths)
Code
# NON_DPPD:END


# Load raw data into R object
# Retry with delay here to accommodate oligo's automatic loading of annotation packages and occasional internet related failures to load
raw_data <- retry_with_delay(
              oligo::read.celfiles,
                df_local_paths$`Local Paths`,
                sampleNames = df_local_paths$`Sample Name`# Map column names as Sample Names (instead of default filenames)
              )
Reading in : GLDS-87_microarray_14R_(Mouse430_2).CEL
Reading in : GLDS-87_microarray_16R_(Mouse430_2).CEL
Reading in : GLDS-87_microarray_20R_(Mouse430_2).CEL
Reading in : GLDS-87_microarray_52R_(Mouse430_2).CEL
Reading in : GLDS-87_microarray_54R_(Mouse430_2).CEL
Reading in : GLDS-87_microarray_58R_(Mouse430_2).CEL
Code
print(str(raw_data))
Formal class 'ExpressionFeatureSet' [package "oligoClasses"] with 9 slots
  ..@ manufacturer     : chr "Affymetrix"
  ..@ intensityFile    : chr NA
  ..@ assayData        :<environment: 0x55761df7f4a8> 
  ..@ phenoData        :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
  .. .. ..@ varMetadata      :'data.frame': 1 obs. of  2 variables:
  .. .. .. ..$ labelDescription: chr "Index"
  .. .. .. ..$ channel         : Factor w/ 2 levels "exprs","_ALL_": 2
  .. .. ..@ data             :'data.frame': 6 obs. of  1 variable:
  .. .. .. ..$ index: int [1:6] 1 2 3 4 5 6
  .. .. ..@ dimLabels        : chr [1:2] "rowNames" "columnNames"
  .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
  .. .. .. .. ..@ .Data:List of 1
  .. .. .. .. .. ..$ : int [1:3] 1 1 0
  .. .. .. .. ..$ names: chr "AnnotatedDataFrame"
  ..@ featureData      :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
  .. .. ..@ varMetadata      :'data.frame': 0 obs. of  1 variable:
  .. .. .. ..$ labelDescription: chr(0) 
  .. .. ..@ data             :'data.frame': 1004004 obs. of  0 variables
  .. .. ..@ dimLabels        : chr [1:2] "featureNames" "featureColumns"
  .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
  .. .. .. .. ..@ .Data:List of 1
  .. .. .. .. .. ..$ : int [1:3] 1 1 0
  .. .. .. .. ..$ names: chr "AnnotatedDataFrame"
  ..@ experimentData   :Formal class 'MIAME' [package "Biobase"] with 13 slots
  .. .. ..@ name             : chr ""
  .. .. ..@ lab              : chr ""
  .. .. ..@ contact          : chr ""
  .. .. ..@ title            : chr ""
  .. .. ..@ abstract         : chr ""
  .. .. ..@ url              : chr ""
  .. .. ..@ pubMedIds        : chr ""
  .. .. ..@ samples          : list()
  .. .. ..@ hybridizations   : list()
  .. .. ..@ normControls     : list()
  .. .. ..@ preprocessing    : list()
  .. .. ..@ other            : list()
  .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
  .. .. .. .. ..@ .Data:List of 2
  .. .. .. .. .. ..$ : int [1:3] 1 0 0
  .. .. .. .. .. ..$ : int [1:3] 1 1 0
  .. .. .. .. ..$ names: chr [1:2] "MIAxE" "MIAME"
  ..@ annotation       : chr "pd.mouse430.2"
  ..@ protocolData     :Formal class 'AnnotatedDataFrame' [package "Biobase"] with 4 slots
  .. .. ..@ varMetadata      :'data.frame': 2 obs. of  2 variables:
  .. .. .. ..$ labelDescription: chr [1:2] "Names of files used in 'exprs'" "Run dates for files used in 'exprs'"
  .. .. .. ..$ channel         : Factor w/ 2 levels "exprs","_ALL_": 2 2
  .. .. ..@ data             :'data.frame': 6 obs. of  2 variables:
  .. .. .. ..$ exprs: chr [1:6] "GLDS-87_microarray_14R_(Mouse430_2).CEL" "GLDS-87_microarray_16R_(Mouse430_2).CEL" "GLDS-87_microarray_20R_(Mouse430_2).CEL" "GLDS-87_microarray_52R_(Mouse430_2).CEL" ...
  .. .. .. ..$ dates: chr [1:6] "02/23/12 11:31:44" "02/23/12 12:04:47" "02/23/12 14:21:33" "02/23/12 11:52:47" ...
  .. .. ..@ dimLabels        : chr [1:2] "rowNames" "columnNames"
  .. .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
  .. .. .. .. ..@ .Data:List of 1
  .. .. .. .. .. ..$ : int [1:3] 1 1 0
  .. .. .. .. ..$ names: chr "AnnotatedDataFrame"
  ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slot
  .. .. ..@ .Data:List of 4
  .. .. .. ..$ : int [1:3] 4 1 1
  .. .. .. ..$ : int [1:3] 2 54 0
  .. .. .. ..$ : int [1:3] 1 3 0
  .. .. .. ..$ : int [1:3] 1 0 0
  .. .. ..$ names: chr [1:4] "R" "Biobase" "eSet" "NChannelSet"
NULL
Code
# Summarize raw data
print("Summarized Raw Data Below") # NON_DPPD
[1] "Summarized Raw Data Below"
Code
print(paste0("Number of Arrays: ", dim(raw_data)[2]))
[1] "Number of Arrays: 6"
Code
print(paste0("Number of Probes: ", dim(raw_data)[1]))
[1] "Number of Probes: 1004004"
Code
message(paste0("Number of Arrays: ", dim(raw_data)[2])) # NON_DPPD
message(paste0("Number of Probes: ", dim(raw_data)[1])) # NON_DPPD
# NON_DPPD:START
DT::datatable(raw_data$targets, caption = "Sample to File Mapping")
Code
DT::datatable(head(raw_data$genes, n = 20), caption = "First 20 rows of raw data file embedded probes to genes table")
Code
# NON_DPPD:END

3 QA For Raw Data

3.1 Density Plot

Code
# Plot settings
par(
  xpd = TRUE # Ensure legend can extend past plot area
)

number_of_sets = ceiling(dim(raw_data)[2] / 30) # Set of 30 samples, used to scale plot

oligo::hist(raw_data, 
            transfo=log2, # Log2 transform raw intensity values
            which=c("all"),
            nsample=10000, # Number of probes to plot
            main = "Density of raw intensities for multiple arrays")
# One legend entry per array, drawn to match the hist line types/colors
legend("topright", legend = colnames(raw_data@assayData$exprs),
        lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types
        col = oligo::darkColors(n = ncol(raw_data)), # Ensure legend color is in sync with plot
        ncol = number_of_sets, # Set number of columns by number of sets
        cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35%
      )

Density of raw intensities for each array. A lack of overlap indicates a need for normalization.

Code
# Reset par
par(original_par)

3.2 Pseudo Image Plots

Code
# Draw one pseudo-image per array (probe intensities rendered as a chip image)
for (i in seq_len(ncol(raw_data))) { # seq_len(n) replaces the roundabout seq_along(1:n)
  message(glue::glue("Drawing Pseudoimage for {colnames(raw_data)[i]}")) # NON_DPPD
  oligo::image(raw_data[,i], 
    transfo = log2,
    main = colnames(raw_data)[i]
    )
}

3.3 MA Plots

Code
# Report which class-specific MA plot strategy is used in the next chunk
if (inherits(raw_data, "GeneFeatureSet")) {
  print("Raw data is a GeneFeatureSet, using exprs() to access expression values and adding 0.0001 to avoid log(0)")
} else if (inherits(raw_data, "ExpressionSet")) { 
  print("Raw data is an ExpressionSet. Using default approach for this class for MA Plot")
} else if (inherits(raw_data, "ExpressionFeatureSet")) { 
  print("Raw data is an ExpressionFeatureSet. Using default approach for this class for MA Plot")
}
[1] "Raw data is an ExpressionFeatureSet. Using default approach for this class for MA Plot"
  • M = Expression log-ratio (this sample vs. pseudo median reference chip)
  • A = Average log-expression
Code
# MA plots: each array against a pseudo median reference chip
if (inherits(raw_data, "GeneFeatureSet")) {
  # GeneFeatureSet: plot from the expression matrix; the small offset avoids log2(0)
  MA_plot <- oligo::MAplot(
    exprs(raw_data) + 0.0001,
    transfo=log2,
    ylim=c(-2, 4),
    main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string
  )
} else if (inherits(raw_data, c("ExpressionSet", "ExpressionFeatureSet"))) {
  # Both classes use the same default MAplot strategy; the two previously
  # duplicated branches are merged via inherits() with a class vector
  MA_plot <- oligo::MAplot(
    raw_data,
    ylim=c(-2, 4),
    main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string
  )
} else {
  # Interpolate the class, not the object: glue cannot coerce a whole S4
  # object to character, which would replace this error with a glue failure
  stop(glue::glue("No strategy for MA plots for class: {class(raw_data)}"))
}

3.4 Boxplots

Code
# Widen the left margin dynamically so long sample names fit as axis labels
max_samplename_length <- max(nchar(colnames(raw_data)))
dynamic_lefthand_margin <- max(max_samplename_length * 0.7, 10)
par(
  mar = c(8, dynamic_lefthand_margin, 8, 2) + 0.1, # mar is the margin around the plot. c(bottom, left, top, right)
  xpd = TRUE
  ) 
boxplot <- oligo::boxplot(raw_data[, rev(colnames(raw_data))], # Here we reverse column order to ensure descending order for samples in horizontal boxplot 
                          transfo=log2, # Log2 transform raw intensity values
                          which=c("all"),
                          nsample=10000, # Number of probes to plot
                          las = 1, # las specifies the orientation of the axis labels. 1 = always horizontal
                          ylab="",
                          xlab="log2 Intensity",
                          main = "Boxplot of raw intensities \nfor perfect match and mismatch probes",
                          horizontal = TRUE
                          )
title(ylab = "Sample Name", mgp = c(dynamic_lefthand_margin-2, 1, 0)) # Place y title beyond the widened margin

Code
# Reset par
par(original_par)

4 Background Correction

Approach reference: https://www.usu.edu/math/jrstevens/stat5570/1.4.Preprocess.pdf

Code
# NON_DPPD:  RMA -> Convolution Background Correction
# Background correct raw intensities (first step of the RMA preprocessing chain)
background_corrected_data <- raw_data %>% oligo::backgroundCorrect(method="rma")

5 Between Array Normalization

Code
# Normalize background-corrected data using the quantile method
# (forces all arrays onto a common intensity distribution)
norm_data <- oligo::normalize(background_corrected_data, 
                              method = "quantile",
                              target = "core" # Use oligo default: core metaprobeset mappings
                              )
                              
# Summarize background-corrected and normalized data
print("Summarized Normalized Data Below") # NON_DPPD
[1] "Summarized Normalized Data Below"
Code
print(paste0("Number of Arrays: ", dim(norm_data)[2]))
[1] "Number of Arrays: 6"
Code
print(paste0("Number of Probes: ", dim(norm_data)[1]))
[1] "Number of Probes: 1004004"
Code
message(paste0("Number of Arrays: ", dim(norm_data)[2])) # NON_DPPD
message(paste0("Number of Probes: ", dim(norm_data)[1])) # NON_DPPD
# NON_DPPD:START
DT::datatable(raw_data$targets, caption = "Sample to File Mapping")
Code
DT::datatable(head(raw_data$genes, n = 20), caption = "First 20 rows of raw data file embedded probes to genes table")
Code
# NON_DPPD:END

6 QA For Normalized Data

6.1 Density Plot

Code
# Plot settings
par(
  xpd = TRUE # Ensure legend can extend past plot area
)

number_of_sets = ceiling(dim(norm_data)[2] / 30) # Set of 30 samples, used to scale plot

oligo::hist(norm_data, 
            transfo=log2, # Log2 transform normalized intensity values
            which=c("all"),
            nsample=10000, # Number of probes to plot
            main = "Density of normalized intensities for multiple arrays")
# One legend entry per array, drawn to match the hist line types/colors
legend("topright", legend = colnames(norm_data@assayData$exprs),
        lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types
        col = oligo::darkColors(n = ncol(norm_data)), # Ensure legend color is in sync with plot
        ncol = number_of_sets, # Set number of columns by number of sets
        cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1
      )

Density of normalized intensities for each array. Compared to the raw data density plot, array densities should overlap more.

Code
# Reset par
par(original_par)

6.2 Pseudo Image Plots

Code
# Draw one pseudo-image per array (probe intensities rendered as a chip image)
for (i in seq_len(ncol(norm_data))) { # seq_len(n) replaces the roundabout seq_along(1:n)
  message(glue::glue("Drawing Pseudoimage for {colnames(norm_data)[i]}")) # NON_DPPD
  oligo::image(norm_data[,i], 
    transfo = log2,
    main = colnames(norm_data)[i]
    )
}

6.3 MA Plots

  • M = Expression log-ratio (this sample vs. pseudo median reference chip)
  • A = Average log-expression
Code
# MA plot each normalized array against the pseudo median reference chip
MA_plot <- oligo::MAplot(
    norm_data, 
    ylim=c(-2, 4),
    main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string
)

6.4 Boxplots

Code
# Widen the left margin dynamically so long sample names fit as axis labels
max_samplename_length <- max(nchar(colnames(norm_data)))
dynamic_lefthand_margin <- max(max_samplename_length * 0.7, 10)
par(
  mar = c(8, dynamic_lefthand_margin, 8, 2) + 0.1, # mar is the margin around the plot. c(bottom, left, top, right)
  xpd = TRUE
  ) 
boxplot <- oligo::boxplot(norm_data[, rev(colnames(norm_data))], # Here we reverse column order to ensure descending order for samples in horizontal boxplot
                          transfo=log2, # Log2 transform normalized intensity values
                          which=c("all"),
                          nsample=10000, # Number of probes to plot
                          las = 1, # las specifies the orientation of the axis labels. 1 = always horizontal
                          ylab="",
                          xlab="log2 Intensity",
                          main = "Boxplot of normalized intensities \nfor perfect match and mismatch probes",
                          horizontal = TRUE
                          )
title(ylab = "Sample Name", mgp = c(dynamic_lefthand_margin-2, 1, 0)) # Place y title beyond the widened margin

Code
# Reset par
par(original_par)

7 Probeset Summarization

Code
# Call RMA but skip normalize and background correction since those have already been applied
# (trailing comma removed: it passed a spurious empty argument into the call)
probeset_level_data <- oligo::rma(norm_data, 
                                    normalize=FALSE, 
                                    background=FALSE
                                    )
Calculating Expression
Code
# Summarize background-corrected and normalized data
print("Summarized Probeset Level Data Below") # NON_DPPD
[1] "Summarized Probeset Level Data Below"
Code
print(paste0("Number of Arrays: ", dim(probeset_level_data)[2]))
[1] "Number of Arrays: 6"
Code
print(paste0("Total Number of Probes Assigned To A Probeset: ", dim(oligo::getProbeInfo(probeset_level_data, target="core")['man_fsetid'])[1])) # man_fsetid means 'Manufacturer Probeset ID'. Ref: https://support.bioconductor.org/p/57191/
[1] "Total Number of Probes Assigned To A Probeset: 496468"
Code
print(paste0("Number of Probesets: ", dim(unique(oligo::getProbeInfo(probeset_level_data, target="core")['man_fsetid']))[1])) # man_fsetid means 'Manufacturer Probeset ID'. Ref: https://support.bioconductor.org/p/57191/
[1] "Number of Probesets: 45101"
Code
message(paste0("Number of Arrays: ", dim(probeset_level_data)[2])) # NON_DPPD
message(paste0("Total Number of Probes Assigned To A Probeset: ", dim(oligo::getProbeInfo(probeset_level_data, target="core")['man_fsetid'])[1])) # NON_DPPD
message(paste0("Number of Probesets: ", dim(unique(oligo::getProbeInfo(probeset_level_data, target="core")['man_fsetid']))[1])) # NON_DPPD
# NON_DPPD:START
DT::datatable(raw_data$targets, caption = "Sample to File Mapping")
Code
DT::datatable(head(raw_data$genes, n = 20), caption = "First 20 rows of raw data file embedded probes to genes table")
Code
# NON_DPPD:END

8 Perform Probeset Differential Expression and Annotation

8.1 Probeset Differential Expression (DE)

8.1.1 Add Probeset Annotations

Code
shortenedOrganismName <- function(long_name) {
  #' Convert organism names like 'Homo Sapiens' into 'hsapiens'
  #'
  #' @param long_name Scalar character: genus then species, space separated
  #' @return Scalar character: lower-cased genus initial + species name
  # Guard: a non-character or multi-element input would silently produce garbage
  stopifnot(is.character(long_name), length(long_name) == 1)

  tokens <- strsplit(long_name, " ", fixed = TRUE)[[1]]
  genus_name <- tokens[1]
  species_name <- tokens[2]

  # Genus initial + species, e.g. "M" + "musculus" -> "mmusculus"
  tolower(paste0(substr(genus_name, 1, 1), species_name))
}

getBioMartAttribute <- function(df_rs) {
  #' Returns resolved biomart attribute source from runsheet
  # NON_DPPD:START
  #' this either comes from the runsheet or as a fall back, the parameters injected during render
  #' if neither exist, an error is thrown
  # NON_DPPD:END

  # Guard clause: runsheet must supply a 'biomart_attribute' column
  if ( is.null(df_rs$`biomart_attribute`) ) {
    stop("ERROR: Could not find 'biomart_attribute' in runsheet")
  }

  print("Using attribute name sourced from runsheet")
  # Format according to biomart needs: spaces -> underscores, lower case only.
  # fixed = TRUE treats the space as a literal, not a regex pattern
  tolower(gsub(" ", "_", unique(df_rs$`biomart_attribute`), fixed = TRUE))
}

get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_portal, ensembl_genomes_version, biomart_attribute) {
  #' Obtain mapping table directly from ftp.  Useful when biomart live service no longer exists for desired version
  #'
  #' @param organism Short organism name (e.g. 'athaliana') used in the mart table filename
  #' @param ensembl_genomes_portal Ensembl genomes portal (e.g. 'plants')
  #' @param ensembl_genomes_version Ensembl genomes release (e.g. '54')
  #' @param biomart_attribute Probeset attribute name embedded in the mart table filename
  #' @return data.frame with columns MAPID, ensembl_gene_id, and <biomart_attribute>
  
  request_url <- glue::glue("https://ftp.ebi.ac.uk/ensemblgenomes/pub/{ensembl_genomes_portal}/release-{ensembl_genomes_version}/mysql/{ensembl_genomes_portal}_mart_{ensembl_genomes_version}/{organism}_eg_gene__efg_{biomart_attribute}__dm.txt.gz")

  print(glue::glue("Mappings file URL: {request_url}"))

  # Create a temporary file name
  temp_file <- tempfile(fileext = ".gz")

  # Download the gzipped table file using the download.file function
  download.file(url = request_url, destfile = temp_file, method = "libcurl") # Use 'libcurl' to support ftps

  # Uncompress the file
  uncompressed_temp_file <- tempfile()
  gzcon <- gzfile(temp_file, "rt")
  content <- readLines(gzcon)
  writeLines(content, uncompressed_temp_file)
  close(gzcon)


  # Load the data into a dataframe
  mapping <- read.table(uncompressed_temp_file, # Read the uncompressed file
                        # Assign column names: an internal map id, the gene id, and the probeset id attribute
                        col.names = c("MAPID", "ensembl_gene_id", biomart_attribute),
                        header = FALSE, # No header in original table
                        sep = "\t") # Tab separated

  # Clean up temporary files
  unlink(temp_file)
  unlink(uncompressed_temp_file)

  return(mapping)
}


# Resolve the short organism name (e.g. 'mmusculus') used for mapping lookups
organism <- shortenedOrganismName(unique(df_rs$organism))

if (organism %in% c("athaliana")) {
  # Arabidopsis: fetch the mapping table from the Ensembl Genomes ftp archive
  # (see get_ensembl_genomes_mappings_from_ftp: used when the biomart live
  # service does not serve the desired version)
  ensembl_genomes_version = "54"
  ensembl_genomes_portal = "plants"
  print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ensembl_genomes_version}"))
  expected_attribute_name <- getBioMartAttribute(df_rs)
  df_mapping <- retry_with_delay(
    get_ensembl_genomes_mappings_from_ftp,
      organism = organism,
      ensembl_genomes_portal = ensembl_genomes_portal,
      ensembl_genomes_version = ensembl_genomes_version,
      biomart_attribute = expected_attribute_name
    )

  # TAIR from the mapping tables tend to be in the format 'AT1G01010.1' but the raw data has 'AT1G01010'
  # So here we remove the '.NNN' from the mapping table where .NNN is any number
  df_mapping$ensembl_gene_id <- stringr::str_replace_all(df_mapping$ensembl_gene_id, "\\.\\d+$", "")
} else {
  # Use biomart from main Ensembl website which archives keep each release on the live service
  # locate dataset
  expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
  print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
  message(paste0("Expected dataset name: '", expected_dataset_name, "'")) # NON_DPPD


  # Specify Ensembl version used in current GeneLab reference annotations
  ENSEMBL_VERSION <- '107'
  print(paste0("Searching for Ensembl Version: ", ENSEMBL_VERSION)) # NON_DPPD

  print(glue::glue("Using Ensembl biomart to get specific version of mapping table. Ensembl version: {ENSEMBL_VERSION}"))

  ensembl <- biomaRt::useEnsembl(biomart = "genes", 
                                dataset = expected_dataset_name,
                                version = ENSEMBL_VERSION)
  print(ensembl)

  expected_attribute_name <- getBioMartAttribute(df_rs)
  print(paste0("Expected attribute name: '", expected_attribute_name, "'"))
  message(paste0("Expected attribute name: '", expected_attribute_name, "'")) # NON_DPPD

  probe_ids <- rownames(probeset_level_data)

  # DEBUG:START
  if ( is.integer(params$DEBUG_limit_biomart_query) ) {
    warning(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
    message(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
    probe_ids <- probe_ids[1:params$DEBUG_limit_biomart_query]
  }
  # DEBUG:END

  # Create probe map
  # Run Biomart Queries in chunks to prevent request timeouts
  #   Note: If timeout is occurring (possibly due to larger load on biomart), reduce chunk size
  CHUNK_SIZE= 1500
  probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
  df_mapping <- data.frame()
  for (i in seq_along(probe_id_chunks)) {
    probe_id_chunk <- probe_id_chunks[[i]]
    print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})"))
    message(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})")) # NON_DPPD
    chunk_results <- biomaRt::getBM(
        attributes = c(
            expected_attribute_name,
            "ensembl_gene_id"
            ), 
            filters = expected_attribute_name, 
            values = probe_id_chunk, 
            mart = ensembl)

    # Only accumulate non-empty chunk results
    if (nrow(chunk_results) > 0) {
      df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results)
    }
    
    Sys.sleep(10) # Slight break between requests to prevent back-to-back requests
  }
}
[1] "Expected dataset name: 'mmusculus_gene_ensembl'"
[1] "Searching for Ensembl Version: 107"
Using Ensembl biomart to get specific version of mapping table. Ensembl version: 107
Warning in listEnsemblArchives(https = FALSE): Ensembl will soon enforce the use of https.
As such the 'https' argument will be deprecated in the next release.
Object of class 'Mart':
  Using the ENSEMBL_MART_ENSEMBL BioMart database
  Using the mmusculus_gene_ensembl dataset[1] "Using attribute name sourced from runsheet"
[1] "Expected attribute name: 'affy_mouse430_2'"
Running biomart query chunk 1 of 31. Total probes IDS in query (1500)
Running biomart query chunk 2 of 31. Total probes IDS in query (1500)
Running biomart query chunk 3 of 31. Total probes IDS in query (1500)
Running biomart query chunk 4 of 31. Total probes IDS in query (1500)
Running biomart query chunk 5 of 31. Total probes IDS in query (1500)
Running biomart query chunk 6 of 31. Total probes IDS in query (1500)
Running biomart query chunk 7 of 31. Total probes IDS in query (1500)
Running biomart query chunk 8 of 31. Total probes IDS in query (1500)
Running biomart query chunk 9 of 31. Total probes IDS in query (1500)
Running biomart query chunk 10 of 31. Total probes IDS in query (1500)
Running biomart query chunk 11 of 31. Total probes IDS in query (1500)
Running biomart query chunk 12 of 31. Total probes IDS in query (1500)
Running biomart query chunk 13 of 31. Total probes IDS in query (1500)
Running biomart query chunk 14 of 31. Total probes IDS in query (1500)
Running biomart query chunk 15 of 31. Total probes IDS in query (1500)
Running biomart query chunk 16 of 31. Total probes IDS in query (1500)
Running biomart query chunk 17 of 31. Total probes IDS in query (1500)
Running biomart query chunk 18 of 31. Total probes IDS in query (1500)
Running biomart query chunk 19 of 31. Total probes IDS in query (1500)
Running biomart query chunk 20 of 31. Total probes IDS in query (1500)
Running biomart query chunk 21 of 31. Total probes IDS in query (1500)
Running biomart query chunk 22 of 31. Total probes IDS in query (1500)
Running biomart query chunk 23 of 31. Total probes IDS in query (1500)
Running biomart query chunk 24 of 31. Total probes IDS in query (1500)
Running biomart query chunk 25 of 31. Total probes IDS in query (1500)
Running biomart query chunk 26 of 31. Total probes IDS in query (1500)
Running biomart query chunk 27 of 31. Total probes IDS in query (1500)
Running biomart query chunk 28 of 31. Total probes IDS in query (1500)
Running biomart query chunk 29 of 31. Total probes IDS in query (1500)
Running biomart query chunk 30 of 31. Total probes IDS in query (1500)
Running biomart query chunk 31 of 31. Total probes IDS in query (101)
Code
# At this point, we have df_mapping from either the biomart live service or the ensembl genomes ftp archive depending on the organism
Code
# Convert list of multi-mapped genes to string
listToUniquePipedString <- function(str_list) {
  #! Collapse the unique elements of a character vector/list into a single
  #! string delimited by '|' characters.
  #! e.g. c("GO1","GO2","GO2","GO3") -> "GO1|GO2|GO3"
  #
  # Fix: join directly with '|' instead of the previous
  # toString() + str_replace_all(", ", "|") round-trip, which would corrupt
  # any element that itself contains the substring ", ".
  paste(unique(str_list), collapse = "|")
}

# Collapse the biomart mapping to one row per probeset ID:
#   each probeset gets a '|'-delimited string of the unique ENSEMBL gene IDs
#   it maps to, plus a count of those mappings.
unique_probe_ids <- df_mapping %>% 
                      # note: '!!sym(VAR)' syntax allows usage of variable 'VAR' in dplyr functions due to NSE. ref: https://dplyr.tidyverse.org/articles/programming.html # NON_DPPD
                      dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probeset ids treated as character type
                      dplyr::group_by(!!sym(expected_attribute_name)) %>% 
                      dplyr::summarise(
                        ENSEMBL = listToUniquePipedString(ensembl_gene_id)
                        ) %>%
                      # Count number of ensembl IDS mapped
                      # (pipe count + 1: a single mapping contains zero '|' separators)
                      dplyr::mutate( 
                        count_ENSEMBL_mappings = 1 + stringr::str_count(ENSEMBL, stringr::fixed("|"))
                      )

# Extract the normalized probeset-level expression matrix (rows = probesets,
# columns = samples)
probeset_expression_matrix <- oligo::exprs(probeset_level_data)

# Attach the biomart ENSEMBL mappings to the expression table.
# Probesets with no biomart match are retained by the left_join (ENSEMBL = NA)
# and get count_ENSEMBL_mappings = 0.
probeset_expression_matrix.biomart_mapped <- probeset_expression_matrix %>% 
  as.data.frame() %>%
  tibble::rownames_to_column(var = "ProbesetID") %>% # Ensure rownames (probeset IDs) can be used as join key
  dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>%
  dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) )

8.2 Summarize Biomart Mapping

Code
# Pie Chart with Percentages
# Summarize biomart mapping outcomes over distinct probeset IDs:
#   exactly one ENSEMBL match / more than one / none.
slices <- c(
    'Unique Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(count_ENSEMBL_mappings == 1) %>% dplyr::distinct(ProbesetID)), 
    'Multi Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(count_ENSEMBL_mappings > 1) %>% dplyr::distinct(ProbesetID)), 
    'No Mapping' = nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::filter(count_ENSEMBL_mappings == 0) %>% dplyr::distinct(ProbesetID))
)
pct <- round(slices/sum(slices)*100)
# Build labels of the form '<category> (<count>) <pct>%'
# (a dead assignment 'chart_names <- names(slices)' that was immediately
#  overwritten has been removed)
chart_names <- glue::glue("{names(slices)} ({slices})") # add count to labels
chart_names <- paste(chart_names, pct) # add percents to labels
chart_names <- paste0(chart_names, "%") # add '%' to labels
pie(slices,labels = chart_names, col=rainbow(length(slices)),
    main=glue::glue("Biomart Mapping to Ensembl Primary Keytype\n {nrow(probeset_expression_matrix.biomart_mapped %>% dplyr::distinct(ProbesetID))} Total Unique Probesets")
    )

Code
print(glue::glue("Biomart Unique Mapping Count: {slices[['Unique Mapping']]}"))
Biomart Unique Mapping Count: 33031
Code
message(glue::glue("Biomart Unique Mapping Count: {slices[['Unique Mapping']]}")) # NON_DPPD

8.3 Generate Design Matrix

Code
runsheetToDesignMatrix <- function(runsheet_path) {
    #' Build a limma design matrix and pairwise contrasts from a GeneLab runsheet
    #'
    #' @param runsheet_path path to the runsheet csv file
    #' @return list with:
    #'   matrix    - no-intercept model matrix (one indicator column per group)
    #'   mapping   - data.frame mapping R-safe group names to original names
    #'   groups    - data.frame of sample -> human-readable group assignments
    #'   contrasts - matrix of all pairwise contrasts (both directions)

    # BUG FIX: previously read the global variable 'runsheet' instead of the
    # 'runsheet_path' argument, leaving the parameter dead.
    df <- read.csv(runsheet_path, check.names = FALSE) %>% 
              dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character

    # Get only 'Factor Value' columns
    factors = as.data.frame(df[,grep("Factor.Value", colnames(df), ignore.case=TRUE)])
    colnames(factors) = paste("factor", seq_len(ncol(factors)), sep= "_")
    
    # Load metadata from runsheet csv file
    compare_csv = data.frame(sample_id = df[,c("Sample Name")], factors)

    # Create data frame containing all samples and respective factors
    study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]])
    colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]]
    rownames(study) <- compare_csv[,1] 
    
    # Format groups and indicate the group that each sample belongs to
    if (dim(study)[2] >= 2){
        group <- apply(study, 1, paste, collapse = " & ") # concatenate multiple factors into one condition per sample
    } else{
        group <- study[,1]
    }
    group_names <- paste0("(", group, ")") # human readable group names
    # Group naming compatible with R models; this maintains the default
    # behaviour of make.names with the exception that 'X' is never prepended
    # to group names
    group <- sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", group)))
    names(group) <- group_names

    # Format contrasts table, defining pairwise comparisons for all groups
    contrast.names <- combn(levels(factor(names(group))), 2) # generate matrix of pairwise group combinations for comparison
    # Convert each human-readable pair back to R-safe names (strip the outer parentheses first)
    contrasts <- apply(contrast.names, MARGIN=2, function(col) sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", stringr::str_sub(col, 2, -2)))))
    contrast.names <- c(paste(contrast.names[1,], contrast.names[2,], sep = "v"), paste(contrast.names[2,], contrast.names[1,], sep = "v")) # format combinations for output table files names
    contrasts <- cbind(contrasts, contrasts[c(2,1),]) # append reverse-direction contrasts
    colnames(contrasts) <- contrast.names
    sampleTable <- data.frame(condition=factor(group))
    rownames(sampleTable) <- df[,c("Sample Name")]

    condition <- sampleTable[,'condition']
    names_mapping <- as.data.frame(cbind(safe_name = as.character(condition), original_name = group_names))

    design <- model.matrix(~ 0 + condition) # no-intercept model: one column per group
    design_data <- list(
        matrix = design,
        mapping = names_mapping,
        groups = as.data.frame(cbind(sample = df[,c("Sample Name")], group = group_names)),
        contrasts = contrasts
    )
    return(design_data)
}


# Loading metadata from runsheet csv file and building the design matrix
design_data <- runsheetToDesignMatrix(runsheet)
design <- design_data$matrix

# Write SampleTable.csv (sample -> group assignments) and contrasts.csv
# (all pairwise comparisons) into the DGE output directory
write.csv(design_data$groups, file.path(DIR_DGE, "SampleTable_GLmicroarray.csv"), row.names = FALSE)
write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv"))

8.4 Perform Individual Probeset Level DE

Code
lmFitPairwise <- function(norm_data, design) {
    #' Fit a limma linear model and moderate every pairwise group contrast
    #'
    #' Approach based on limma manual section 17.4 (version 3.52.4)
    #'
    #' @param norm_data normalized expression data
    #' @param design model matrix with one column per group
    #' @return an eBayes-moderated contrasts fit object

    model_fit <- limma::lmFit(norm_data, design)

    # Identify the group columns of the design and enumerate every ordered
    # pair of groups (forward and reverse directions)
    group_columns <- colnames(model_fit$design)[which(model_fit$assign == 1)]
    pairings <- combn(group_columns, 2)
    forward_contrasts <- paste(pairings[1,], pairings[2,], sep = "-")
    reverse_contrasts <- paste(pairings[2,], pairings[1,], sep = "-")

    # Build the contrast matrix and refit
    contrast_matrix <- limma::makeContrasts(
        contrasts = c(forward_contrasts, reverse_contrasts),
        levels = design
    )
    refit <- limma::contrasts.fit(model_fit, contrast_matrix)

    # Empirical Bayes moderation with intensity trend and robust estimation
    refit <- limma::eBayes(refit, trend=TRUE, robust=TRUE)
    return(refit)
}

# Calculate results: moderated fit across all pairwise group contrasts
res <- lmFitPairwise(probeset_level_data, design)
DT::datatable(limma::topTable(res)) # NON_DPPD
Code
# Print DE table, without filtering
# write.fit emits one row per probeset with Coef/t/P.value columns per
# contrast; 'BH' = Benjamini-Hochberg multiple-testing adjustment
limma::write.fit(res, adjust = 'BH', 
                file = "INTERIM.csv",
                row.names = FALSE,
                quote = TRUE,
                sep = ",")

8.5 Add Additional Columns and Format DE Table

Code
## Reformat Table for consistency across DE analyses tables within GeneLab ##

# Read in DE table produced by limma::write.fit above
df_interim <- read.csv("INTERIM.csv")

# Bind columns from biomart mapped expression table
# NOTE(review): bind_cols relies on both tables having the same row order;
# write.fit appears to preserve the fit's probeset order — confirm if either
# input changes
df_interim <- df_interim %>% 
  dplyr::bind_cols(probeset_expression_matrix.biomart_mapped)

# Reformat column names
reformat_names <- function(colname, group_name_mapping) {
  # NON_DPPD:START
  #! Converts limma::write.fit column names such as:
  #!    "P.value.adj.conditionWild.Type...Space.Flight...1st.generation.conditionWild.Type...Ground.Control...4th.generation"
  #! to something like:
  #! "Adj.p.value(Wild Type & Space Flight & 1st generation)v(Wild Type & Ground Control & 4th generation)"
  #! Since two groups are expected to be replaced, ensure replacements happen in pairs

  # The 'condition' prefix was introduced while creating the design matrix;
  # the remaining renames keep naming consistent across genomics DE outputs
  # NON_DPPD:END

  # Swap limma statistic prefixes for GeneLab prefixes, then mark the
  # separator between the two groups of a contrast with 'v'
  renamed <- colname %>%
    stringr::str_replace(pattern = "^P.value.adj.condition", replacement = "Adj.p.value_") %>%
    stringr::str_replace(pattern = "^P.value.condition", replacement = "P.value_") %>%
    stringr::str_replace(pattern = "^Coef.condition", replacement = "Log2fc_") %>% # This is the Log2FC as per: https://rdrr.io/bioc/limma/man/writefit.html
    stringr::str_replace(pattern = "^t.condition", replacement = "T.stat_") %>%
    stringr::str_replace(pattern = ".condition", replacement = "v")

  # Substitute each R-safe group name with the original human-readable name
  mapping <- unique(group_name_mapping)
  for (row_i in seq_len(nrow(mapping))) {
    renamed <- stringr::str_replace(
      renamed,
      pattern = stringr::fixed(mapping[row_i,]$safe_name),
      replacement = mapping[row_i,]$original_name
    )
  }

  return(renamed)
}

# Rename only the limma statistic columns (those containing '.condition')
df_interim <- df_interim %>% dplyr::rename_with(reformat_names, .cols = matches('\\.condition'), group_name_mapping = design_data$mapping)


## Add Group Wise Statistics ##

# Group mean and standard deviations for normalized expression values are computed and added to the table

# NOTE(review): 'design_data$group' resolves to the 'groups' list element via
# R's partial matching on '$' — works, but 'design_data$groups' would be safer
unique_groups <- unique(design_data$group$group)
for ( i in seq_along(unique_groups) ) {
  current_group <- unique_groups[i]
  # Collect the sample names belonging to the current group
  # (this summarize() intentionally yields one row per sample in the group)
  current_samples <- design_data$group %>% 
                      dplyr::group_by(group) %>%
                      dplyr::summarize(
                        samples = sort(unique(sample))
                      ) %>%
                      dplyr::filter(
                        group == current_group
                      ) %>% 
                      dplyr::pull()
                    
  print(glue::glue("Computing mean and standard deviation for Group {i} of {length(unique_groups)}"))
  print(glue::glue("Group: {current_group}"))
  print(glue::glue("Samples in Group: '{toString(current_samples)}'"))
  # NON_DPPD:START
  message(glue::glue("Computing mean and standard deviation for Group {i} of {length(unique_groups)}"))
  message(glue::glue("Group: {current_group}"))
  message(glue::glue("Samples in Group: '{toString(current_samples)}'"))
  # NON_DPPD:END
  
  # Add per-group mean/stdev columns; the glue template in the LHS name
  # produces e.g. 'Group.Mean_(Space Flight)'
  df_interim <- df_interim %>% 
    dplyr::mutate( 
      "Group.Mean_{current_group}" := rowMeans(dplyr::select(., all_of(current_samples))),
      "Group.Stdev_{current_group}" := matrixStats::rowSds(as.matrix(dplyr::select(., all_of(current_samples)))),
      ) %>% 
    dplyr::ungroup() %>%
    as.data.frame()
}
Computing mean and standard deviation for Group 1 of 2
Group: (Ground Control)
Samples in Group: 'Mmus_C57-6CR_RTN_GC_Rep1, Mmus_C57-6CR_RTN_GC_Rep2, Mmus_C57-6CR_RTN_GC_Rep3'
Computing mean and standard deviation for Group 2 of 2
Group: (Space Flight)
Samples in Group: 'Mmus_C57-6CR_RTN_FLT_Rep1, Mmus_C57-6CR_RTN_FLT_Rep2, Mmus_C57-6CR_RTN_FLT_Rep3'
Code
# NON_DPPD:START
## Compute all sample mean and standard deviation
message(glue::glue("Computing mean and standard deviation for all samples"))
# NON_DPPD:END
# NOTE(review): 'design_data$group' partial-matches the 'groups' list element
all_samples <- design_data$group %>% dplyr::pull(sample)
# Add whole-dataset mean/stdev columns across every sample column
df_interim <- df_interim %>% 
  dplyr::mutate( 
    "All.mean" := rowMeans(dplyr::select(., all_of(all_samples))),
    "All.stdev" := matrixStats::rowSds(as.matrix(dplyr::select(., all_of(all_samples)))),
    ) %>% 
  dplyr::ungroup() %>%
  as.data.frame()

print("Remove extra columns from final table")
[1] "Remove extra columns from final table"
Code
# These columns are data mapped to column PROBEID as per the original Manufacturer and can be linked as needed
colnames_to_remove = c(
  "AveExpr" # Replaced by 'All.mean' column
)

# any_of() tolerates names that are already absent from the table
df_interim <- df_interim %>% dplyr::select(-any_of(colnames_to_remove))

## Concatenate annotations for genes (for uniquely mapped probes) ##
### Read in annotation table for the appropriate organism ###
# quote/comment.char disabled so gene names containing quotes or '#' parse intact
annot <- read.table(
            annotation_file_path,
            sep = "\t",
            header = TRUE,
            quote = "",
            comment.char = "",
        )

# Join annotation table and uniquely mapped data

# Determine appropriate keytype as found in annotation tables
# (organism name -> primary key column of the GeneLab annotation table)
map_primary_keytypes <- c(
  'Caenorhabditis elegans' = 'ENSEMBL',
  'Danio rerio' = 'ENSEMBL',
  'Drosophila melanogaster' = 'ENSEMBL',
  'Rattus norvegicus' = 'ENSEMBL',
  'Saccharomyces cerevisiae' = 'ENSEMBL',
  'Homo sapiens' = 'ENSEMBL',
  'Mus musculus' = 'ENSEMBL',
  'Arabidopsis thaliana' = 'TAIR'
)

# Left-style merge keyed on the organism's primary keytype
df_interim <- merge(
                annot,
                df_interim,
                by.x = map_primary_keytypes[[unique(df_rs$organism)]],
                by.y = "ENSEMBL",
                # ensure all original dge rows are kept.
                # If unmatched in the annotation database, then fill missing with NAN
                all.y = TRUE
            )

## Reorder columns before saving to file

# Annotation columns first, keyed by the organism's primary keytype
ANNOTATIONS_COLUMN_ORDER = c(
  map_primary_keytypes[[unique(df_rs$organism)]],
  "SYMBOL",
  "GENENAME",
  "REFSEQ",
  "ENTREZID",
  "STRING_id",
  "GOSLIM_IDS"
)

# Probe bookkeeping columns
PROBE_INFO_COLUMN_ORDER = c(
  "ProbesetID",
  "count_ENSEMBL_mappings"
)
# One column per sample, in runsheet order
SAMPLE_COLUMN_ORDER <- all_samples
generate_prefixed_column_order <- function(subjects, prefixes) {
  #' Return a vector of columns based on subject and given prefixes
  #'  Used for both contrasts and groups column name generation
  #'
  #' @param subjects character vector of contrast or group names
  #' @param prefixes character vector of column-name prefixes
  #' @return character vector ordered subject-major, prefix-minor
  #'   (for each subject, one entry per prefix)

  # Vectorized with paste0 instead of growing a vector with append() inside
  # nested loops (quadratic copying) and a glue::glue() call per element.
  # Empty 'subjects' yields an empty result, as before.
  unlist(lapply(subjects, function(subject) paste0(prefixes, subject)), use.names = FALSE)
}
# Statistic columns: Log2fc/T.stat/P.value/Adj.p.value for every contrast
STAT_COLUMNS_ORDER <- generate_prefixed_column_order(
  subjects = colnames(design_data$contrasts),
  prefixes = c(
    "Log2fc_",
    "T.stat_",
    "P.value_",
    "Adj.p.value_"
    )
  )
# Whole-table statistics ('F' and 'F.p.value' come from limma::write.fit)
ALL_SAMPLE_STATS_COLUMNS_ORDER <- c(
  "All.mean",
  "All.stdev",
  "F",
  "F.p.value"
)

# Per-group mean and standard deviation columns
GROUP_MEAN_COLUMNS_ORDER <- generate_prefixed_column_order(
  subjects = unique(design_data$groups$group),
  prefixes = c(
    "Group.Mean_"
    )
  )
GROUP_STDEV_COLUMNS_ORDER <- generate_prefixed_column_order(
  subjects = unique(design_data$groups$group),
  prefixes = c(
    "Group.Stdev_"
    )
  )
# Full output column order for the DE table
FINAL_COLUMN_ORDER <- c(
  ANNOTATIONS_COLUMN_ORDER, 
  PROBE_INFO_COLUMN_ORDER, 
  SAMPLE_COLUMN_ORDER, 
  STAT_COLUMNS_ORDER, 
  ALL_SAMPLE_STATS_COLUMNS_ORDER, 
  GROUP_MEAN_COLUMNS_ORDER,
  GROUP_STDEV_COLUMNS_ORDER
  )

## Assert final column order includes all columns from original table
## (setequal is order-insensitive; on mismatch, abort and report both
##  difference sets to make the missing/extra names obvious)
if (!setequal(FINAL_COLUMN_ORDER, colnames(df_interim))) {
  write.csv(FINAL_COLUMN_ORDER, "FINAL_COLUMN_ORDER.csv")
  NOT_IN_DF_INTERIM <- paste(setdiff(FINAL_COLUMN_ORDER, colnames(df_interim)), collapse = ":::")
  NOT_IN_FINAL_COLUMN_ORDER <- paste(setdiff(colnames(df_interim), FINAL_COLUMN_ORDER), collapse = ":::")
  stop(glue::glue("Column reordering attempt resulted in different sets of columns than original. Names unique to 'df_interim': {NOT_IN_FINAL_COLUMN_ORDER}. Names unique to 'FINAL_COLUMN_ORDER': {NOT_IN_DF_INTERIM}."))
}

## Perform reordering
df_interim <- df_interim %>% dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))

# Save the full differential expression table to file
write.csv(df_interim, file.path(DIR_DGE, "differential_expression_GLmicroarray.csv"), row.names = FALSE)

## Output column subset file with just normalized probeset level expression values
## (annotations + probe bookkeeping + per-sample expression, no statistics)
write.csv(
  df_interim[c(
  ANNOTATIONS_COLUMN_ORDER,
  "ProbesetID",
  "count_ENSEMBL_mappings",
  all_samples)
  ], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset_GLmicroarray.csv"), row.names = FALSE)

### Generate and export PCA table for GeneLab visualization plots
# Note: expression at the Probeset level is already log2 transformed.
# Consistency fix: use the namespaced oligo::exprs(), as everywhere else in
# this script, rather than relying on 'exprs' being attached.
PCA_raw <- prcomp(t(oligo::exprs(probeset_level_data)), scale = FALSE)
write.csv(PCA_raw$x,
          file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv")
          )

## Determine column order for probe level tables
## (same layout as probeset tables, plus a ProbeID column; overwrites the
##  probeset-level constants defined above)

PROBE_INFO_COLUMN_ORDER = c(
  "ProbesetID",
  "ProbeID",
  "count_ENSEMBL_mappings"
)

FINAL_COLUMN_ORDER <- c(
  ANNOTATIONS_COLUMN_ORDER, 
  PROBE_INFO_COLUMN_ORDER, 
  SAMPLE_COLUMN_ORDER
  )

## Generate raw intensity matrix that includes annotations
## (probe level: one row per probe 'fid'; the right_join against
##  getProbeInfo maps each probe to its parent probeset)

background_corrected_data_annotated <- oligo::exprs(background_corrected_data) %>% 
  as.data.frame() %>%
  tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probe-level fids) can be used as join key
  dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing
  dplyr::right_join(oligo::getProbeInfo(background_corrected_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid
  dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID
  dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID
  dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings
  dplyr::left_join(annot, by = c("ENSEMBL" = map_primary_keytypes[[unique(df_rs$organism)]])) %>% # Join with GeneLab Reference Annotation Table using key name expected in organism specific annotation table
  dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) %>% # Convert NA mapping to 0
  dplyr::rename( !!map_primary_keytypes[[unique(df_rs$organism)]] := ENSEMBL ) # Rename 'ENSEMBL' to the organism's primary keytype name

## Perform reordering using the probe-level FINAL_COLUMN_ORDER
background_corrected_data_annotated <- background_corrected_data_annotated %>% 
  dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))

write.csv(background_corrected_data_annotated, file.path(DIR_RAW_DATA, "raw_intensities_probe_GLmicroarray.csv"), row.names = FALSE)

## Generate normalized expression matrix that includes annotations
## (same probe-level annotation pipeline as the raw intensities above,
##  applied to the normalized data)
norm_data_matrix_annotated <- oligo::exprs(norm_data) %>% 
  as.data.frame() %>%
  tibble::rownames_to_column(var = "fid") %>% # Ensure rownames (probe-level fids) can be used as join key
  dplyr::mutate(dplyr::across(fid, as.integer)) %>% # Ensure fid is integer type, consistent with getProbeInfo typing
  dplyr::right_join(oligo::getProbeInfo(norm_data), by = "fid") %>% # Add 'man_fsetid' via mapping based on fid
  dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID
  dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID
  dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>% # Join with biomaRt ENSEMBL mappings
  dplyr::left_join(annot, by = c("ENSEMBL" = map_primary_keytypes[[unique(df_rs$organism)]])) %>% # Join with GeneLab Reference Annotation Table using key name expected in organism specific annotation table
  dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) %>% # Convert NA mapping to 0
  dplyr::rename( !!map_primary_keytypes[[unique(df_rs$organism)]] := ENSEMBL ) # Rename 'ENSEMBL' to the organism's primary keytype name



norm_data_matrix_annotated <- norm_data_matrix_annotated %>% 
  dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))

write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe_GLmicroarray.csv"), row.names = FALSE)

9 Version Reporting

Code
get_versions <- function() {
  #' Build a YAML-formatted buffer of software versions for this session
  #'
  #' @return a string with one stanza (name/version/homepage/workflow task)
  #'   for R itself, every attached package, and every namespace-loaded package

  clean_url_field <- function(url_vector) {
    # URL field can include multiple entries and newline characters
    #   This helper function extracts just the first url
    # Handle empty fields, populate downstream
    if (is.null(url_vector)) {  
      return("NO URLS ENCODED")
    }
    tryCatch(
    {return(
        (url_vector %>% 
          stringr::str_split(pattern = ",") %>% # Often split on commas
          dplyr::first() %>% # Get first token after comma split
          stringr::str_split(pattern = " ") %>% # Sometimes just spaces to split, e.g. URL: https://github.com/jeroen/curl (devel) https://curl.se/libcurl/  \n(upstream)
          dplyr::first() %>% # Get first token after space split
          stringr::str_replace_all(pattern = "\n", replacement = "") # Never allow newlines, hopefully unlikely after prior steps to isolate first url token
        )[[1]]
      )},
    error = function(cond) {
            print(url_vector)
            stop(cond)
        }
    )
 
  }

  # Format one package's stanza; shared by attached and namespace-loaded loops
  # (previously this 4-line template was duplicated verbatim in two loops)
  package_stanza <- function(software) {
    c(
      glue::glue("- name: {software[['Package']]}"),
      glue::glue("  version: {software[['Version']]}"),
      glue::glue("  homepage: {clean_url_field(software[['URL']])}"),
      glue::glue("  workflow task: PROCESS_AFFYMETRIX")
    )
  }

  # Note: newlines seem duplicated here as 'glue' trims the first and last newline if they exist
  session_info <- sessionInfo()
  # start with just R version
  versions_buffer <- glue::glue_collapse(c(
    glue::glue("- name: R"),
    glue::glue("  version: {session_info[['R.version']][['major']]}.{session_info[['R.version']][['minor']]}"),
    glue::glue("  homepage: https://www.r-project.org/"),
    glue::glue("  workflow task: PROCESS_AFFYMETRIX")
    ), sep = "\n") 
  # Append 'other attached packages' then
  # 'loaded via a namespace (and not attached):' — same order as before
  for (software in c(session_info[["otherPkgs"]], session_info[["loadedOnly"]])) {
    versions_buffer <- glue::glue_collapse(c(
      versions_buffer,
      package_stanza(software)
      ), sep = "\n")
  }
  return(versions_buffer)
}

## Note Libraries that were NOT used during processing
versions_buffer <- get_versions()

# For plant datasets biomaRt is not used; record that explicitly so the
# versions file stays complete
if (organism %in% c("athaliana")) {
  versions_buffer <- glue::glue_collapse(c(
    versions_buffer,
    glue::glue("- name: biomaRt"),
    glue::glue("  version: (Not used for plant datasets)"),
    glue::glue("  homepage: https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html"),
    glue::glue("  workflow task: PROCESS_AFFYMETRIX")
    ), sep = "\n")
}

## Log same info into versions.txt file
# NOTE(review): append = TRUE would duplicate entries if this chunk reruns in
# the same working directory — assumes each task starts in a clean directory
version_output_fn <- "versions.yml"
cat(versions_buffer, file = version_output_fn, append = TRUE, sep = "\n")
## Print for report
print("Session Info below: ")
[1] "Session Info below: "
Code
R version 4.1.3 (2022-03-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 18.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.21.so

locale:
[1] C

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] pd.mouse430.2_3.12.0 DBI_1.1.3            oligo_1.58.0        
 [4] Biobase_2.54.0       oligoClasses_1.56.0  RSQLite_2.2.20      
 [7] Biostrings_2.62.0    GenomeInfoDb_1.30.1  XVector_0.34.0      
[10] IRanges_2.28.0       S4Vectors_0.32.4     BiocGenerics_0.40.0 
[13] dplyr_1.0.10        

loaded via a namespace (and not attached):
 [1] bitops_1.0-7                matrixStats_0.63.0         
 [3] bit64_4.0.5                 filelock_1.0.2             
 [5] progress_1.2.2              httr_1.4.4                 
 [7] tools_4.1.3                 bslib_0.4.2                
 [9] utf8_1.2.2                  R6_2.5.1                   
[11] DT_0.26                     affyio_1.64.0              
[13] KernSmooth_2.23-20          withr_2.5.0                
[15] tidyselect_1.2.0            prettyunits_1.1.1          
[17] bit_4.0.5                   curl_4.3.3                 
[19] compiler_4.1.3              preprocessCore_1.56.0      
[21] cli_3.6.0                   xml2_1.3.3                 
[23] DelayedArray_0.20.0         sass_0.4.4                 
[25] rappdirs_0.3.3              stringr_1.5.0              
[27] digest_0.6.31               rmarkdown_2.20             
[29] R.utils_2.12.2              pkgconfig_2.0.3            
[31] htmltools_0.5.4             MatrixGenerics_1.6.0       
[33] limma_3.50.3                dbplyr_2.2.1               
[35] fastmap_1.1.0               htmlwidgets_1.6.1          
[37] rlang_1.0.6                 jquerylib_0.1.4            
[39] generics_0.1.3              jsonlite_1.8.4             
[41] crosstalk_1.2.0             R.oo_1.25.0                
[43] RCurl_1.98-1.9              magrittr_2.0.3             
[45] GenomeInfoDbData_1.2.7      Matrix_1.5-3               
[47] Rcpp_1.0.9                  fansi_1.0.3                
[49] lifecycle_1.0.3             R.methodsS3_1.8.2          
[51] stringi_1.7.12              yaml_2.3.6                 
[53] SummarizedExperiment_1.24.0 zlibbioc_1.40.0            
[55] BiocFileCache_2.2.0         grid_4.1.3                 
[57] affxparser_1.66.0           blob_1.2.3                 
[59] crayon_1.5.2                lattice_0.20-45            
[61] splines_4.1.3               hms_1.1.2                  
[63] KEGGREST_1.34.0             knitr_1.41                 
[65] pillar_1.8.1                GenomicRanges_1.46.1       
[67] codetools_0.2-18            biomaRt_2.50.0             
[69] XML_3.99-0.13               glue_1.6.2                 
[71] evaluate_0.19               BiocManager_1.30.19        
[73] vctrs_0.5.1                 png_0.1-8                  
[75] foreach_1.5.2               purrr_1.0.1                
[77] assertthat_0.2.1            cachem_1.0.6               
[79] xfun_0.36                   ff_4.0.7                   
[81] tibble_3.1.8                iterators_1.0.14           
[83] AnnotationDbi_1.56.2        memoise_2.0.1              
[85] statmod_1.5.0               ellipsis_0.3.2